df = read.csv("/Users/esterperdochova/Desktop/STAT DS/Bike-Sharing-Dataset/hour.csv")
df
df2 = df[df$season %in% c(2,3), ]
df2 = df2[df2$workingday == 1, ]
df2
df2 %>%
mutate(hrFct = fct_rev(as.factor(hr))) %>%
ggplot(aes(y = hrFct)) +
geom_density_ridges(
aes(x = cnt, fill = paste(hrFct, season)),
alpha = .8, color = "white"#, from = 0, to = 100
) +
labs(
x = "Count",
y = "Hour",
title = "Distribution of cnt w.r.t. time and season."
) +
scale_y_discrete(expand = c(0, 0)) +
scale_x_continuous(expand = c(0, 0)) +
scale_fill_cyclical(
breaks = c("Spring", "Summer"),
labels = c(`Spring` = "Spring", `Summer` = "Summer"),
values = c("#ff0000", "#0000ff", "#ff8080", "#8080ff"),
name = "Season", guide = "legend"
) +
coord_cartesian(clip = "off") +
theme_ridges(grid = FALSE)
## Picking joint bandwidth of 25.3
-> Let’s find highest running X hour count
query = "
SELECT hr
, season
, SUM(cnt) cnt
FROM df2
GROUP BY hr
, season
"
(df_hour_season = sqldf(query))
peak_finder = function(df, roll_length) {
temp_list = c()
for (i in seq(0, 23)) {
temp_range = seq(i, i+roll_length-1) %% 24
temp_count = sum(df[df$hr %in% temp_range, ]$cnt)
temp_list = c(temp_list, temp_count)
}
temp_loc = which.max(temp_list) + 1
peak_range = seq(temp_loc, temp_loc+roll_length-1) %% 24
return(peak_range)
}
# For spring
(peak_spring = peak_finder(df_hour_season[df_hour_season$season == '2', ], 4))
## [1] 18 19 20 21
# For summer
(peak_summer = peak_finder(df_hour_season[df_hour_season$season == '3', ], 4))
## [1] 18 19 20 21
# Combo:
(peak_combo = peak_finder(df_hour_season, 4))
## [1] 18 19 20 21
df2[df2$hr %in% peak_spring, ]
df2[df2$hr %in% peak_spring, ] %>%
mutate(hrFct = fct_rev(as.factor(hr))) %>%
ggplot(aes(y = hrFct)) +
geom_density_ridges(
aes(x = cnt, fill = paste(hrFct, season)),
alpha = .8, color = "white"#, from = 0, to = 100
) +
labs(
x = "Count",
y = "Hour",
title = "Distribution of cnt w.r.t. time and season."
) +
scale_y_discrete(expand = c(0, 0)) +
scale_x_continuous(expand = c(0, 0)) +
scale_fill_cyclical(
breaks = c("Spring", "Summer"),
labels = c(`Spring` = "Spring", `Summer` = "Summer"),
values = c("#ff0000", "#0000ff", "#ff8080", "#8080ff"),
name = "Season", guide = "legend"
) +
coord_cartesian(clip = "off") +
theme_ridges(grid = FALSE)
## Picking joint bandwidth of 42.5
visualise_df = function(df, name) {
name = gsub(" ", "\n", name)
n_breaks = max(min(nrow(df)/200, 40), 20)
temp_dens = density(df$cnt, adjust = 2)
temp_dens_log = density(log(df$cnt), adjust = 2)
hist(df$cnt,
col = "floralwhite",
breaks = n_breaks,
prob = TRUE,
xlab = "Cnt",
main = "Hist.: Cnt")
lines(temp_dens,
col = "firebrick4")
legend("topright", legend = name, bty = "n")
hist(log(df$cnt),
col = "floralwhite",
breaks = n_breaks,
prob = TRUE,
xlab = "ln(Cnt)",
main = "Hist.: ln(Cnt)")
lines(temp_dens_log,
col = "firebrick4")
legend("topright", legend = name, bty = "n")
}
# For spring:
df_spring = df2[df2$season == 2, ]
df_spring = df_spring[df_spring$hr %in% peak_spring, ]
shapiro.test(df_spring$cnt)
##
## Shapiro-Wilk normality test
##
## data: df_spring$cnt
## W = 0.96189, p-value = 3.009e-10
descdist(df_spring$cnt, boot = 1000)
## summary statistics
## ------
## min: 22 max: 868
## median: 318.5
## mean: 351.2266
## estimated sd: 186.9774
## estimated skewness: 0.630501
## estimated kurtosis: 2.857972
(dist_object = fitdist(df_spring$cnt, "norm"))
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 351.2266 8.255288
## sd 186.7947 5.837337
plot(dist_object)
# For summer:
df_summer = df2[df2$season == 3, ]
df_summer = df_summer[df_summer$hr %in% peak_summer, ]
shapiro.test(df_summer$cnt)
##
## Shapiro-Wilk normality test
##
## data: df_summer$cnt
## W = 0.95379, p-value = 9.655e-12
descdist(df_summer$cnt, boot = 1000)
## summary statistics
## ------
## min: 38 max: 977
## median: 381
## mean: 418.0401
## estimated sd: 190.0553
## estimated skewness: 0.7097991
## estimated kurtosis: 2.991612
(dist_object = fitdist(df_summer$cnt, "norm"))
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 418.0401 8.294648
## sd 189.8739 5.865202
plot(dist_object)
par(mar = c(4, 3, 3, 3), mgp = c(2, 0.5,0))
graphics::layout(mat = matrix(1:4, nrow = 2, ncol = 2, byrow = F),
heights = 1, widths = 1)
visualise_df(df_spring, "Spring")
visualise_df(df_summer, "Summer")
ks.test(df_spring$cnt, df_summer$cnt)
## Warning in ks.test.default(df_spring$cnt, df_summer$cnt): p-value will be
## approximate in the presence of ties
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: df_spring$cnt and df_summer$cnt
## D = 0.16761, p-value = 9.591e-07
## alternative hypothesis: two-sided
df3 = df2
df3$y_bool = df3$season - 2
df3 = df3[, -3]
df3$dteday = as.numeric(as.Date(df3$dteday))
mod1 = glm(y_bool ~ ., data = df3, family = binomial(link = 'logit'))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod1)
##
## Call:
## glm(formula = y_bool ~ ., family = binomial(link = "logit"),
## data = df3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.971e-04 -2.000e-08 2.000e-08 2.000e-08 4.937e-04
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.398e+07 4.739e+08 0.051 0.960
## instant 6.750e+01 1.330e+03 0.051 0.960
## dteday -1.601e+03 3.164e+04 -0.051 0.960
## yr -4.966e+03 9.478e+04 -0.052 0.958
## mnth -1.640e+02 4.299e+03 -0.038 0.970
## hr -6.752e+01 1.332e+03 -0.051 0.960
## holiday NA NA NA NA
## weekday 1.438e+01 3.724e+02 0.039 0.969
## workingday NA NA NA NA
## weathersit 2.671e-01 5.447e+02 0.000 1.000
## temp 1.790e+01 2.198e+04 0.001 0.999
## atemp 1.192e+01 2.053e+04 0.001 1.000
## hum 9.989e+00 6.216e+03 0.002 0.999
## windspeed 3.538e+00 4.769e+03 0.001 0.999
## casual -2.598e-02 2.916e+01 -0.001 0.999
## registered 2.792e-03 3.285e+00 0.001 0.999
## cnt NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8.6038e+03 on 6206 degrees of freedom
## Residual deviance: 8.7607e-06 on 6193 degrees of freedom
## AIC: 28
##
## Number of Fisher Scoring iterations: 25
df3 = as.matrix(df3)
bst = xgboost(df3[, -17], df3[, 17], nrounds = 50,
eta = 0.2, max_depth = 5, subsample = .5, verbose = 0)
xgb.plot.shap(df3[, -17], model = bst, top_n = 4, n_col = 2)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2.97
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.03
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.1209
xgb.ggplot.shap.summary(df3[, -17], model = bst)